Setup

Load libraries

library(ggplot2)
library(tidyr)
library(dplyr)
library(reshape2)

# For ARIMA
library(astsa)
library(forecast)

# For SIR
library(deSolve)
library(dfoptim)

Import data

import_data <- function(file) {
    df <- read.csv(file, header = FALSE, skip = 1)
    locations <- as.matrix(df[,1])
    dates <- as.matrix(read.csv(file, header = FALSE, nrows = 1)[1,-1])
    df <- as.data.frame(t(df[,-1]))
    rownames(df) <- dates
    colnames(df) <- locations
    # print(sapply(df, class))
    return(df)
}

cases_intl      <- import_data("data/International/International_covid_cases_data.csv")
deaths_intl     <- import_data("data/International/International_covid_deaths_data.csv")
cases_LA        <- import_data("data/LA/LA_cities_covid_data.csv")
cases_US        <- import_data("data/US_Counties/US_county_covid_cases_data.csv")
deaths_US       <- import_data("data/US_Counties/US_county_covid_deaths_data.csv")

Plot Data

Total Cases Per Day

overall_df <- function(df) {
    return(data.frame(days=seq(0,length(rownames(df))-1), 
                      cases=rowSums(df[,-1])))
}

overall_plot <- function(df, title) {
    ggplot(data=df, aes(x=days, y=cases, group=1)) + 
        geom_line() +
        ggtitle(title)
}

cases_LA_overall <- overall_df(cases_LA)
cases_US_overall <- overall_df(cases_US)
cases_intl_overall <- overall_df(cases_intl)

overall_plot(cases_LA_overall, "Overall LA Cases")

overall_plot(cases_US_overall, "Overall US Cases")

overall_plot(cases_intl_overall, "Overall International Cases")

Locations Plotted Individually

plot_all <- function(df, title, legend = FALSE) {
    rownames(df) <- seq(0,length(rownames(df))-1)
    m <- melt(as.matrix(df))
    colnames(m) <- c("Days", "Location", "Count")
    ggplot(m, aes(x=Days, y=Count, color=Location)) +
        geom_line(show.legend = legend) +
        scale_colour_viridis_d() +
        ggtitle(title)
}

max_counts <- function(df) {
    rownames(df) <- seq(0,length(rownames(df))-1)
    m <- melt(as.matrix(df))
    colnames(m) <- c("Days", "Location", "Count")
    return(m %>% group_by(Location) %>% slice_max(n=1, order_by=Count, with_ties = FALSE) %>% ungroup())
}
plot_all(cases_intl, "International Cases")

cases_intl_max <- max_counts(cases_intl)
cases_intl_max <- cases_intl_max[order(cases_intl_max$Count, decreasing = TRUE),]
cases_intl_max
plot_all(cases_US, "US County Cases")

cases_US_max <- max_counts(cases_US)
cases_US_max <- cases_US_max[order(cases_US_max$Count, decreasing = TRUE),]
cases_US_max
plot_all(cases_LA, "LA Cases")

cases_LA_max <- max_counts(cases_LA)
cases_LA_max <- cases_LA_max[order(cases_LA_max$Count, decreasing = TRUE),]
cases_LA_max

Locations with Most COVID-19 Cases and Other Selected Plots

plot_all(select(cases_intl, cases_intl_max$Location[1:10]), "International Cases Top 10", legend=TRUE)

plot_all(select(cases_US, cases_US_max$Location[1:10]), "US County Cases Top 10", legend=TRUE)

# locations with nice curves
# plot_all(select(cases_US, `New York City, New York`, `Suffolk, New York`, `Cook, Illinois`), "US County Cases Selected Plots", legend = TRUE)

# Los Angeles
# plot_all(select(cases_US, `Los Angeles, California`), "Los Angeles, California")
plot_all(select(cases_LA, cases_LA_max$Location[1:10]), "LA Cases Top 10", legend=TRUE)

Analyze with ARIMA

arima_wrap <- function(df, location, percentage = 0.7, verbose = TRUE) {
    timeseries = unlist(as.list(select(df, location)))
    if(verbose) {
        plot.ts(timeseries)
        plot.ts(diff(timeseries))
        acf(diff(timeseries))
    }
    
    train_series <- timeseries[1:length(timeseries)*percentage]
    test_series <- timeseries[-1:-length(timeseries)*percentage]
    
    # parametrize model
    AutoArimaModel=auto.arima(train_series)
    AutoArimaModel
    
    # train
    futurVal <- forecast(AutoArimaModel,h=10, level=c(99.5)) 
    futurVal <- forecast(AutoArimaModel,h=50, level=c(80)) #adjust timesteps ahead, and confidence level
    
    checkresiduals(AutoArimaModel)
    plot(futurVal)
    return(futurVal)
}
futurSuffolk <- arima_wrap(cases_US, "Suffolk, New York")
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(location)` instead of `location` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,2,0)
## Q* = 95.378, df = 7, p-value < 2.2e-16
## 
## Model df: 3.   Total lags used: 10

futurLA <- arima_wrap(cases_LA_overall, "cases")

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,2,1)
## Q* = 22.686, df = 6, p-value = 0.0009089
## 
## Model df: 4.   Total lags used: 10

futurUS <- arima_wrap(cases_US_overall, "cases")

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,2,0)
## Q* = 120.98, df = 5, p-value < 2.2e-16
## 
## Model df: 5.   Total lags used: 10

futurIntl <- arima_wrap(cases_intl_overall, "cases")

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,2,2)
## Q* = 70.454, df = 6, p-value = 3.3e-13
## 
## Model df: 4.   Total lags used: 10

forecasts = c(2,7,30)
futur <- list(futurSuffolk$mean[forecasts], futurLA$mean[forecasts], futurUS$mean[forecasts], futurIntl$mean[forecasts])
futur <- data.frame(matrix(unlist(futur), nrow=length(futur), byrow=T),stringsAsFactors=FALSE)
rownames(futur) <- c("Suffolk, New York", "All LA Cities", "All US Counties", "All International")
colnames(futur) <- c("2 Days", "1 Week", "1 Month")
futur

Analyze with SIR

Infected <- cases_US$`Los Angeles, California`[-1:-25] # Suffolk, New York
Days <- seq(1,length(Infected))

#plot(Days, Infected)

SIR <- function(time, state, parameters) {
  par <- as.list(c(state, parameters))
  with(par, {
    dS <- -beta * I * S / N
    dI <- beta * I * S / N - gamma * I
    dR <- gamma * I
    list(c(dS, dI, dR))
  })
}

N <- 3.99e6 #1.477e6
init <- c(
  S = N - Infected[1],
  I = Infected[1],
  R = 0
)

# define a function to calculate the residual sum of squares
# (RSS), passing in parameters beta and gamma that are to be
# optimised for the best fit to the incidence data
RSS <- function(parameters) {
  names(parameters) <- c("beta", "gamma")
  out <- ode(y = init, times = Days, func = SIR, parms = parameters)
  fit <- out[, 3]
  sum((Infected - fit)^2)
}
# now find the values of beta and gamma that give the
# smallest RSS, which represents the best fit to the data.
# Start with values of 0.5 for each, and constrain them to
# the interval 0 to 1.0

lower = c(0, 0)
upper = c(1, 1)

Opt <- optim(c(0.5, 0.5),
  RSS,
  method = "L-BFGS-B",
  lower = lower,
  upper = upper
)
Opt$message
## [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
# Opt <-nmkb(c(0.5, 0.5), RSS, lower=lower, upper=upper)
# Opt$message

# library(Rvmmin)
# Opt <- Rvmmin(c(0.5, 0.5), RSS, lower=lower, upper=upper)
# Opt$message

Opt_par <- setNames(Opt$par, c("beta", "gamma"))
Opt_par
##      beta     gamma 
## 0.4079545 0.3115256
Opt_par[1] = 0.38
Opt_par[2] = 0.29
# get the fitted values from our SIR model
fitted_cumulative_incidence <- data.frame(ode(
  y = init, times = Days,
  func = SIR, parms = Opt_par
))

fitted_cumulative_incidence <- fitted_cumulative_incidence %>%
  mutate(
    cumulative_incident_cases = Infected
  )

fitted_cumulative_incidence %>%
  ggplot(aes(x = time)) +
  geom_line(aes(y = I), colour = "red") +
  geom_point(aes(y = cumulative_incident_cases), colour = "blue") +
  labs(
    y = "Cumulative incidence",
    title = "COVID-19 fitted vs observed cumulative incidence, Los Angeles, California",
    subtitle = "(Red = fitted from SIR model, blue = observed)"
  ) +
  theme_minimal()

R0 <- as.numeric(Opt_par[1] / Opt_par[2])
R0
## [1] 1.310345